Чтение, слияние, редактирование исходных датасетов

info_healthy <- read_xlsx ("data/raw/final_health_statistic.xlsx")
info_ibs <- read_xlsx ("data/raw/final_ibs_141_statistic.xlsx")
## Уникальные переменные в info_ibs:
## Main_Disease
## Weight_kg
## Height_cm
## BMI
## Diet_type
## Diet_duration
## Additive_usage
## Уникальные переменные в info_healthy:
## Drugs

combined_info

  • Объединение info_healthy и info_ibs в единый датасет combined_info c редактированием содержимого
combined_info <- info_healthy %>% 
  bind_rows(info_ibs) %>%
  mutate (
    BMI_min = ifelse (is.na (BMI_min), round (Weight_min /(Height_max/100 * Height_max/100), 2), BMI_min),
    BMI_max = ifelse (is.na (BMI_max), round (Weight_max /(Height_min/100 * Height_min/100), 2), BMI_max)
    ) %>% 
  unite("BMI_range", BMI_min, BMI_max, sep = "-", na.rm = TRUE) %>%
  unite ("Age_range", Age_min, Age_max, sep = "-", na.rm = TRUE) %>%
  mutate(
    Age_range = case_when(
      Age_range == "18-40" | Age_range == "23-28" | Age_range == "16-42" | Age_range == "21-43" ~ "16-43",
      Age <= 43 ~ "16-43",
      Age > 43 ~ "> 43",
      TRUE ~ NA_character_), #удалена группа 28-54
    research_ID = sub ("research_", "", research_ID),
    research_ID = case_when(
      research_ID == 0 ~ 1,
      research_ID == 1 ~ 2,
      research_ID == 2 ~ 3,
      research_ID == 3 ~ 4,
      research_ID == 4 ~ 5, 
      research_ID == 6 ~ 6, 
      research_ID == 7 ~ 7), 
    patient_ID = row_number(),
    Sex = ifelse (Sex == "mixed", NA, Sex),
    Smoking = sub ("never", "Never",  Smoking),
    Smoking = case_when(
      Smoking == "No" ~ "No",
      Smoking == "Never" ~ "No",
      Smoking == "Rarely (a few times/month)" ~ "Yes", #5 чел.
      Smoking == "Occasionally (1-2 times/week)" ~ "Yes",  #3 чел.
      Smoking == "Regularly (3-5 times/week)" ~ "Yes",  #1 чел.
      Smoking == "Daily" ~ "Yes"), #7 чел.
    Alcohol = sub ("rarely", "Rarely", Alcohol),
    Alcohol = ifelse(Alcohol == "Regularly (3-5 times/week)"|
                       Alcohol == "Daily", #3 чел.
                     "Regularly (3-7 times/week)", Alcohol),
    Antibiotics_usage = case_when(
      Antibiotics_usage == "Month" | Antibiotics_usage == ~ "3 months" |
        Antibiotics_usage == "6 months" ~ "1-6 months",
        #2 чел. больных Month, 0 чел. здоровых 3 months, 0 чел. здоровых для 6 months
      Antibiotics_usage == "Year" | Antibiotics_usage == "Not use"  ~ 
        "12 months/Not use"), # 0 чел. из здоровых для 6-12 months, 0 чел. больных для Not use
    Hygiene = case_when(
      Hygiene == "Occasionally (1-2 times/week) cosmetics" ~ "Occasionally cosmetics (1-2 times/week)",
      Hygiene == "Rarely (a few times/month) cosmetics" ~ "Rarely cosmetics (a few times/month)",
      TRUE ~ Hygiene),
    Hygiene = case_when(
      Hygiene == "Daily cosmetics"|Hygiene == "Regularly cosmetics (3-5 times/week)" ~ "Regularly (3-7 times/week)", #0 чел. больных  для 3-5 times/week
      Hygiene == "Occasionally cosmetics (1-2 times/week)" | Hygiene == "Rarely cosmetics (a few times/month)" ~ "Occasionally (a few-8 times/month)",
      #только 2 чел. больных для 1-2 times/week
      Hygiene == "Never cosmetics" ~ "Never"),
    Physical_activity = sub ("regularly", "Regularly",  Smoking),
    BMI = ifelse (is.na(Weight_kg), BMI, Weight_kg/ (Height_cm/100 * Height_cm/100)),
    BMI_range = ifelse(BMI_range == "", NA, BMI_range),
    BMI_category = case_when(
      BMI_range == "18-25" ~ "normal/overweight",
      BMI_range == "19.21-29.29" ~ "normal/overweight",
      BMI_range == "20.6-29.6" ~ "normal/overweight",
      BMI_range == "21.74-28.38" ~ "normal/overweight",
      BMI_range == "18.5-30.8" ~ "normal/overweight", #немного больше 30
      BMI < 18.5 ~ "underweight",
      BMI >= 18.5 & BMI < 30 ~ "normal/overweight",
      BMI >= 30 ~ "obese")
    ) %>% 
  rename ("Cosmetics" = Hygiene ) %>% 
  
  mutate_if (is.character, as.factor) %>% 
  
  select(-c(
    Instrument, # unique (combined_info$Instrument) = "Illumina MiSeq" 
    Isolation_source, # unique (combined_info$Isolation_source) = "faeces" 
    Assay_type, # unique (combined_info$Assay_type) = "AMPLICON"
    Target_gene, # unique (combined_info$Target_gene) = "16S"
    Main_Disease, # unique (combined_info$Main_Disease) = NA (for healthy), 141 (for ibs)
    Drugs, # unique (combined_info$Drugs) = NA
    Social_status, # unique (combined_info$Social_status) =  NA, urban 
    Weight_kg, Height_cm, Weight_min, Weight_max, Height_min, Height_max, BMI_range, #использованы для создания BMI_category, уменьшения количества NA в BMI
    BMI, # NA у всех здоровых
    Birth_Year, # имеются значения только в тех случаях, где возраст уже известен
    Pets_type # только cat и NA
  ))

rm (info_healthy, info_ibs)
summary (combined_info)
##   research_ID      patient_ID  Seq_region     Seq_date     Health_state
##  Min.   :1.000   Min.   :  1   V3-V4:106   Min.   :2015   Disease:170  
##  1st Qu.:3.000   1st Qu.: 96   V4   :229   1st Qu.:2017   Health :211  
##  Median :4.000   Median :191   V5-V6: 46   Median :2018                
##  Mean   :4.194   Mean   :191               Mean   :2019                
##  3rd Qu.:6.000   3rd Qu.:286               3rd Qu.:2021                
##  Max.   :7.000   Max.   :381               Max.   :2023                
##                                                                        
##       Age        Age_range       Sex         Country  
##  Min.   :19.00   > 43 : 44   female: 79   Spain  :95  
##  1st Qu.:27.25   16-43:242   male  :112   Poland :70  
##  Median :34.00   NA's : 95   NA's  :190   Belgium:46  
##  Mean   :38.71                            Germany:45  
##  3rd Qu.:47.00                            Italy  :37  
##  Max.   :76.00                            Austria:28  
##  NA's   :247                              (Other):60  
##                         Race     Smoking                             Alcohol   
##  African American         :  1   No  :187   Never                        : 12  
##  Asian or Pacific Islander:  1   Yes : 16   Occasionally (1-2 times/week): 24  
##  Caucasian                :116   NA's:178   Rarely (a few times/month)   : 36  
##  Hispanic                 :  1              Regularly (3-7 times/week)   : 15  
##  Other                    : 11              NA's                         :294  
##  NA's                     :251                                                 
##                                                                                
##          Antibiotics_usage Physical_activity  Travel_period
##  1-6 months       : 48     No  :187          3 months: 18  
##  12 months/Not use:129     Yes : 16          6 months: 10  
##  NA's             :204     NA's:178          Month   : 13  
##                                              Year    : 45  
##                                              NA's    :295  
##                                                            
##                                                            
##          Education_level                              Cosmetics  
##  Bachelor's level: 31    Never                             : 44  
##  Master's level  : 28    Occasionally (a few-8 times/month): 17  
##  School Level    : 25    Regularly (3-7 times/week)        : 25  
##  NA's            :297    NA's                              :295  
##                                                                  
##                                                                  
##                                                                  
##    Sleep_duration Diet_type  Diet_duration Additive_usage
##  > 8 hours:  9    10.0: 13   3.0 :  8      22.0:  1      
##  4-6 hours:  9    5.0 :  5   6.0 :  5      23.0:  1      
##  6-7 hours: 31    NA's:363   NA's:368      4.0 :  9      
##  7-8 hours: 38                             NA's:370      
##  NA's     :294                                           
##                                                          
##                                                          
##             BMI_category
##  normal/overweight:287  
##  obese            :  5  
##  underweight      :  1  
##  NA's             : 88  
##                         
##                         
## 
  • Таблица с количеством субъектов (n) в зависимости от исследования, даты, наличия/отсутствия СРК
combined_info %>% 
  select (research_ID, Country, Seq_date, Health_state) %>% 
  group_by(research_ID, Country, Seq_date, Health_state) %>% 
  summarise(n = n()) %>% 
  flextable() %>% theme_box() %>% 
  merge_v(c ("research_ID", "Country", "Seq_date", "Health_state")) %>% 
  align(align = "center", part = "all")

research_ID

Country

Seq_date

Health_state

n

1

Belgium

2,018

Health

46

2

Italy

36

3

Poland

2,023

70

4

Australia

2,021

6

Austria

2,020

3

Germany

33

2,021

12

Greece

1

Hungary

2

Ireland

Disease

1

Israel

Health

1

Italy

1

Norway

2,018

Disease

1

UK

7

2,020

2

2,021

3

USA

2,018

6

2,020

1

2,021

5

2,022

1

2,023

1

5

Austria

2,017

25

6

Israel

2,022

22

7

Spain

2,015

95

tbl_summary of combined_info

library(gtsummary)

combined_info %>% 
  select(! c(patient_ID,
              Diet_duration, Additive_usage, Diet_type #у всех здоровых данные переменные = NA
              )) %>% 
  tbl_summary(digits = list(all_continuous() ~ c(0, 0),
                            all_categorical() ~ c(0, 0)),
              by = Health_state) %>%
  add_p()
Characteristic Disease, N = 1701 Health, N = 2111 p-value2
research_ID <0.001
    1 0 (0%) 46 (22%)
    2 0 (0%) 36 (17%)
    3 0 (0%) 70 (33%)
    4 28 (16%) 59 (28%)
    5 25 (15%) 0 (0%)
    6 22 (13%) 0 (0%)
    7 95 (56%) 0 (0%)
Seq_region <0.001
    V3-V4 0 (0%) 106 (50%)
    V4 170 (100%) 59 (28%)
    V5-V6 0 (0%) 46 (22%)
Seq_date <0.001
    2015 95 (56%) 0 (0%)
    2017 25 (15%) 0 (0%)
    2018 14 (8%) 82 (39%)
    2020 3 (2%) 36 (17%)
    2021 9 (5%) 23 (11%)
    2022 23 (14%) 0 (0%)
    2023 1 (1%) 70 (33%)
Age 42 (32, 50) 28 (26, 37) <0.001
    Unknown 95 152
Age_range <0.001
    > 43 34 (45%) 10 (5%)
    16-43 41 (55%) 201 (95%)
    Unknown 95 0
Sex 0.15
    female 35 (48%) 44 (37%)
    male 38 (52%) 74 (63%)
    Unknown 97 93
Country
    Australia 0 (0%) 6 (3%)
    Austria 25 (15%) 3 (1%)
    Belgium 0 (0%) 46 (22%)
    Germany 0 (0%) 45 (21%)
    Greece 0 (0%) 1 (0%)
    Hungary 0 (0%) 2 (1%)
    Ireland 1 (1%) 0 (0%)
    Israel 22 (13%) 1 (0%)
    Italy 0 (0%) 37 (18%)
    Norway 1 (1%) 0 (0%)
    Poland 0 (0%) 70 (33%)
    Spain 95 (56%) 0 (0%)
    UK 12 (7%) 0 (0%)
    USA 14 (8%) 0 (0%)
Race 0.4
    African American 0 (0%) 1 (1%)
    Asian or Pacific Islander 0 (0%) 1 (1%)
    Caucasian 27 (100%) 89 (86%)
    Hispanic 0 (0%) 1 (1%)
    Other 0 (0%) 11 (11%)
    Unknown 143 108
Smoking 4 (14%) 12 (7%) 0.2
    Unknown 142 36
Alcohol 0.002
    Never 4 (14%) 8 (14%)
    Occasionally (1-2 times/week) 4 (14%) 20 (34%)
    Rarely (a few times/month) 9 (32%) 27 (46%)
    Regularly (3-7 times/week) 11 (39%) 4 (7%)
    Unknown 142 152
Antibiotics_usage 0.020
    1-6 months 2 (8%) 46 (30%)
    12 months/Not use 23 (92%) 106 (70%)
    Unknown 145 59
Physical_activity 4 (14%) 12 (7%) 0.2
    Unknown 142 36
Travel_period 0.066
    3 months 3 (11%) 15 (26%)
    6 months 1 (4%) 9 (16%)
    Month 4 (14%) 9 (16%)
    Year 20 (71%) 25 (43%)
    Unknown 142 153
Education_level <0.001
    Bachelor's level 5 (18%) 26 (46%)
    Master's level 18 (64%) 10 (18%)
    School Level 5 (18%) 20 (36%)
    Unknown 142 155
Cosmetics 0.6
    Never 15 (56%) 29 (49%)
    Occasionally (a few-8 times/month) 6 (22%) 11 (19%)
    Regularly (3-7 times/week) 6 (22%) 19 (32%)
    Unknown 143 152
Sleep_duration 0.2
    > 8 hours 4 (14%) 5 (8%)
    4-6 hours 4 (14%) 5 (8%)
    6-7 hours 12 (43%) 19 (32%)
    7-8 hours 8 (29%) 30 (51%)
    Unknown 142 152
BMI_category 0.012
    normal/overweight 135 (96%) 152 (100%)
    obese 5 (4%) 0 (0%)
    underweight 1 (1%) 0 (0%)
    Unknown 29 59
1 n (%); Median (IQR)
2 Pearson’s Chi-squared test; Wilcoxon rank sum test; Fisher’s exact test
  • Загрузка датасетов с данными о микробиоме здоровых людей (bacteria_healthy) и пациентов с СРК (bacteria_ibs) и их объединение в combined_bacteria
bacteria_healthy <- read_csv("data/raw/final_bacteria_health.csv")
bacteria_ibs <- read_csv("data/raw/final_bacteria_ibs_141.csv")

combined_bacteria <- bacteria_healthy %>% 
  bind_rows (bacteria_ibs) %>% 
  mutate(patient_ID = row_number())

rm (bacteria_healthy, bacteria_ibs)
  • Объединение combined_bacteria и combined_info в data_wide
data_wide <- combined_info %>% 
  left_join (combined_bacteria)

Оценка доли NA и нулевых значений

data_wide %>% 
  select (where (function(x) sum (is.na(x))/ nrow(data_wide) * 100 > 0)) %>% 
  sapply (function(x) sum (is.na(x))/ nrow(data_wide) * 100) %>% round(1) %>% 
  as.data.frame() %>% 
  rename(NA_percentage = ".") %>% 
  mutate (
    "Number of people with known data" = round (nrow(data_wide) - NA_percentage/100 * nrow(data_wide)),
    NA_percentage = paste (NA_percentage, "%", sep = " ")
    ) %>% 
  arrange(desc (NA_percentage)) %>% 
  rownames_to_column() %>% 
  as_tibble() %>% flextable()

rowname

NA_percentage

Number of people with known data

Additive_usage

97.1 %

11

Diet_duration

96.6 %

13

Diet_type

95.3 %

18

Education_level

78 %

84

Travel_period

77.4 %

86

Cosmetics

77.4 %

86

Alcohol

77.2 %

87

Sleep_duration

77.2 %

87

Race

65.9 %

130

Age

64.8 %

134

Antibiotics_usage

53.5 %

177

Sex

49.9 %

191

Smoking

46.7 %

203

Physical_activity

46.7 %

203

Age_range

24.9 %

286

BMI_category

23.1 %

293

# Устанавливаем порог процента 
threshold_percent <- 95 
 
# Функция для вычисления процента записей, не равных NA и не равных 0, для каждой колонки 
calculate_percentage <- function(col) { 
  sum(!is.na(col) & col != 0) / length(col) * 100 
} 
 
# Применяем функцию к каждой колонке в датасете 
percentage_non_zero_non_na <- sapply(data_wide[, -1], calculate_percentage) 
 
# Создаем датафрейм с результатами 
result_df_sort <- data.frame( 
  column = names(percentage_non_zero_non_na), 
  percentage = round(100 - percentage_non_zero_non_na, 2) # percentage означает пропущенные или 0 значения
) %>% 
  arrange(desc(percentage))
 
# Отфильтровываем колонки, у которых процент записей менее threshold_percent% 
filtered_columns <- result_df_sort[result_df_sort$percentage < threshold_percent, ]

# Сохраним датасет в excel для дальнейшего анализа
write.xlsx(filtered_columns, 
           file = "data/originals/percentage_by_vars.xlsx")

# Перезапись data_wide с выбором колонок с процентом NA/0 менее threshold_percent 
data_wide <- data_wide %>% 
  select (row.names(filtered_columns), research_ID)

rm (calculate_percentage, result_df_sort)
# Устанавливаем порог процента 
threshold_percent <- 95
 
# Рассчитываем процент значений, не являющихся NA и не равных 0, для каждого пациента 
percentage_non_zero_non_na <- rowMeans(!is.na(data_wide) & data_wide != 0, na.rm = TRUE) * 100 
 
# Создаем датафрейм с результатами 
result_df_sort <- data.frame( 
  patient_id = data_wide$patient_ID, 
  percentage = round(100 - percentage_non_zero_non_na, 2) # percentage означает пропущенные или 0 значения
) %>% arrange(desc(percentage))

# Отфильтровываем пациентов, у которых процент значений менее threshold_percent% 
filtered_patients <- result_df_sort[result_df_sort$percentage < threshold_percent, ] 

# Сохраним датасет в excel для дальнейшего анализа 
write.xlsx(filtered_patients,
           file = "data/originals/percentage_by_patient.xlsx") 

# Перезапись data_wide с удалением строк с процентом NA/0 более threshold_percent 
data_wide <- data_wide %>% 
  slice (filtered_patients$patient_id) #при threshold_percent = 95%, изменения data_wide не происходит, так как нет пациентов с процентом NA/0 более 95%

rm (percentage_non_zero_non_na, result_df_sort)
data_wide <- data_wide %>% 
  select (patient_ID, any_of (colnames(combined_info)), everything()) %>% 
  arrange(patient_ID)

write_rds(data_wide, 
          file = "data/originals/data_wide.rds")

data_long

  • Перевод data_wide в длинный формат (data_long), загрузка bacterial_functions, объединение data_long и bacterial_functions
# Создание датасета в длинном формате
data_long <- data_wide %>% 
  pivot_longer(ends_with(c("_D", "_P", "_O", "_C", "_F", "_G")),
               names_to = "Taxon", values_to = "Percentage")

# Чтение листов Excel-файла с функциями бактерий и их объединение
path <- "data/raw/Bacterial group functions.xlsx"
taxon <- c ("TaxonName", "Rank")

bac_functions <- read_xlsx (path, 1) %>% 
  full_join(read_xlsx (path, 2), by = taxon) %>%
  full_join(read_xlsx (path, 3), by = taxon) %>%
  full_join(read_xlsx (path, 4), by = taxon) %>% 
  full_join(read_xlsx (path, 5), by = taxon) %>%
  full_join(read_xlsx (path, 6), by = taxon) %>%
  full_join(read_xlsx (path, 7), by = taxon) %>%
  unite("Taxon", TaxonName, Rank, sep = "_") %>% 
  unite ("Habbit", Habbit, Habit_state, sep = "_") %>% 
  mutate (Habbit = sub ("NA_NA", NA, Habbit)) %>%
  pivot_wider (names_from = Habbit, values_from = Habbit) %>% 
  mutate ("NA" = NULL) %>%
  unite ("Neuromediator_destroy", Neuromediator, Destroy, sep = "_") %>% 
  mutate (Neuromediator_destroy = sub ("NA_NA", NA, Neuromediator_destroy)) %>% 
  pivot_wider(names_from = Neuromediator_destroy, values_from = Neuromediator_destroy) %>%
  mutate ("NA" = NULL) %>% 
  pivot_wider (names_from = Vitamin, values_from = Vitamin) %>% 
  mutate ("NA" = NULL) %>%
  unchop(everything()) %>% 
  unique() %>% #удаление повторяющихся строк
  filter (Taxon != "Blautia obeum_S") %>% #для данного таксона противоречивая информация в Продуценты КЦЖК
  mutate (across (
    (3:31), function (x) ifelse (is.na(x), NA, 1))) %>% 
  mutate_all(as.factor)

rm (path, taxon)

#Перезапись data_long с добавлением функций бактерий
data_long <- data_long %>% 
  left_join (bac_functions, by = "Taxon")

#Сохранение data_long.rds
write_rds(data_long, 
          file = "data/originals/data_long.rds",
          compress = "gz") 
  • Создание отдельных длинных датасетов для каждого таксона
G_long <- data_long %>% subset(grepl("_G", Taxon)) 
# %>% select (where (function(x) sum (is.na(x))/ nrow(.) * 100 < threshold_percent))
F_long <- data_long %>% subset(grepl("_F", Taxon))
C_long <- data_long %>% subset(grepl("_C", Taxon))
O_long <- data_long %>% subset(grepl("_O", Taxon))
P_long <- data_long %>% subset(grepl("_P", Taxon))

write_rds(G_long, 
          file = "data/originals/G_long.rds", "gz") 
write_rds(F_long, 
          file = "data/originals/F_long.rds", "gz") 
write_rds(C_long, 
          file = "data/originals/C_long.rds", "gz") 
write_rds(O_long, 
          file = "data/originals/O_long.rds", "gz") 
write_rds(P_long, 
          file = "data/originals/P_long.rds", "gz") 

Факторный анализ

# Создание нового датасета для сохранения результатов
result_dataset <- data.frame(
  Variable_Name = character(),
  Test_Type = character(),
  P_Value = numeric(),
  Normal_Distribution = character(),
  stringsAsFactors = FALSE
)

alpha = 0.05

# Объединяем датасеты по patient_id 
combined_data <- data_wide %>%
  select(Health_state,
    ends_with(c("_D", "_P", "_O", "_C", "_F", "_G")))

combined_bacteria_clean <- combined_data

# Получаем список переменных из датасета combined_bacteria, исключая "patient_ID"
values_to_exclude <- c("patient_ID", "Seq_date", "Age")
variable_names <- setdiff(names(combined_bacteria_clean), values_to_exclude)

# Проходим по каждой переменной
for (variable in variable_names) {
  # Фильтрация данных (исключаем строки с NA и 0 в текущей переменной)
  filtered_data <- combined_data[!is.na(combined_data[[variable]]) & combined_data[[variable]] != 0, ]
  #print(variable)
  #print(filtered_data)

  # Проверим что датасет filtered_data не пустой и что количество групп сравнения более 1, в нашем случае их 2 :)
  if (nrow(filtered_data) > 0 & length(unique(filtered_data$Health_state)) > 1) {
    
  # Check if there is sufficient variability in the data
  if (length(unique(filtered_data$Health_state)) > 1) {
    # Проверка на нормальность
    shapiro_test_result <- try(shapiro.test(filtered_data[[variable]]), silent = TRUE)
    if (inherits(shapiro_test_result, "try-error")) {
      warning(paste("Skipping Shapiro-Wilk test for variable", variable, "due to an error."))
      next
    }
    p_value_shapiro <- shapiro_test_result$p.value

    # Выбор соответствующего статистического теста
    if (p_value_shapiro > 0.05) {
      # Если нормальное распределение, провести дисперсионный анализ
      model <- aov(filtered_data[[variable]] ~ Health_state, data = filtered_data)
      summary_list <- summary(model)
      test_type <- "ANOVA"
    } else {
      # Если не нормальное распределение, использовать тест Краскела-Уоллиса
      tryCatch({
        model <- kruskal.test(filtered_data[[variable]] ~ Health_state, data = filtered_data)
        test_type <- "Kruskal-Wallis"
      }, error = function(e) {
        warning(paste("Skipping Kruskal-Wallis test for variable", variable, "due to an error:", conditionMessage(e)))
        next
      })
    }

    # Добавление результатов выбранного теста в датасет
    result_dataset <- rbind(result_dataset, 
                            data.frame(Variable_Name = variable,
                                       Test_Type = test_type,
                                       P_Value = ifelse(test_type == "ANOVA", summary_list[[1]]$"Pr(>F)"[1], model$p.value),
                                       Normal_Distribution = ifelse(p_value_shapiro > 0.05, "Yes", "No")))
  } else {
    # If there is no variability, skip the tests
    warning(paste("Skipping tests for variable", variable, "as there is not enough variability in the data."))
  }
  }}
## Warning: Skipping Shapiro-Wilk test for variable Health_state due to an error.
# Добавление столбца с поправкой Бонферрони
result_dataset$Adjusted_P_Value_Bonferroni <- p.adjust(result_dataset$P_Value, method = "bonferroni")

# Добавление столбца с поправкой Холма
result_dataset$Adjusted_P_Value_Holm <- p.adjust(result_dataset$P_Value, method = "holm")

# Добавление столбца с поправкой Бенджамини-Хохберга
result_dataset$Adjusted_P_Value_BH <- p.adjust(result_dataset$P_Value, method = "BH")

result_dataset$test_pass = ifelse(result_dataset$Adjusted_P_Value_Bonferroni < alpha & result_dataset$Adjusted_P_Value_Holm < alpha & result_dataset$Adjusted_P_Value_BH < alpha, "Y", "N")

result_dataset_pass <- result_dataset %>% filter(test_pass == "Y")

# Вывод результатов
print(result_dataset_pass)
##                               Variable_Name      Test_Type      P_Value
## 1                               Eukaryota_D Kruskal-Wallis 3.641890e-07
## 2                            Nitrospirota_P Kruskal-Wallis 7.764816e-07
## 3                          Armatimonadota_P Kruskal-Wallis 5.555616e-10
## 4                         Acidobacteriota_P Kruskal-Wallis 3.112329e-09
## 5                         Patescibacteria_P Kruskal-Wallis 1.463219e-15
## 6                           Cyanobacteria_P Kruskal-Wallis 6.542425e-06
## 7                             Chloroflexi_P Kruskal-Wallis 6.316248e-23
## 8                       Verrucomicrobiota_P Kruskal-Wallis 8.425194e-09
## 9                        Actinobacteriota_P Kruskal-Wallis 6.600394e-23
## 10                         Proteobacteria_P Kruskal-Wallis 2.752566e-06
## 11                   Desulfitobacteriales_O Kruskal-Wallis 5.530423e-05
## 12                           Pirellulales_O Kruskal-Wallis 3.196572e-09
## 13                         Anaerolineales_O Kruskal-Wallis 3.746162e-09
## 14                        Xanthomonadales_O Kruskal-Wallis 1.874109e-06
## 15                         Pedosphaerales_O Kruskal-Wallis 1.302641e-15
## 16                      Acholeplasmatales_O Kruskal-Wallis 1.552118e-08
## 17                                SBR1031_O Kruskal-Wallis 3.378675e-19
## 18                     Sphingobacteriales_O Kruskal-Wallis 5.724693e-10
## 19                       Flavobacteriales_O Kruskal-Wallis 8.296231e-08
## 20                       Staphylococcales_O Kruskal-Wallis 2.450391e-21
## 21                           Monoglobales_O Kruskal-Wallis 2.153674e-06
## 22                      Bifidobacteriales_O Kruskal-Wallis 1.699744e-15
## 23                        Chitinophagales_O Kruskal-Wallis 5.210248e-17
## 24                             Bacillales_O Kruskal-Wallis 7.441037e-10
## 25             Clostridia vadinBB60 group_O Kruskal-Wallis 9.747928e-08
## 26                       Coriobacteriales_O Kruskal-Wallis 4.680932e-22
## 27                          Clostridiales_O Kruskal-Wallis 2.333755e-14
## 28                     Erysipelotrichales_O Kruskal-Wallis 7.511336e-10
## 29                        Lactobacillales_O Kruskal-Wallis 9.824748e-11
## 30    Peptostreptococcales-Tissierellales_O Kruskal-Wallis 8.737004e-17
## 31                        Oscillospirales_O Kruskal-Wallis 4.294684e-05
## 32                              vadinHA49_C Kruskal-Wallis 4.885983e-06
## 33                     Desulfitobacteriia_C Kruskal-Wallis 5.530423e-05
## 34                         Microgenomatia_C Kruskal-Wallis 2.221509e-05
## 35                        Dehalococcoidia_C Kruskal-Wallis 1.999580e-05
## 36                          Phycisphaerae_C Kruskal-Wallis 3.006549e-07
## 37                         Planctomycetes_C Kruskal-Wallis 4.398873e-05
## 38                           Rhodothermia_C Kruskal-Wallis 5.134815e-09
## 39                         Cyanobacteriia_C Kruskal-Wallis 2.599878e-06
## 40                           Anaerolineae_C Kruskal-Wallis 1.126492e-19
## 41                       Verrucomicrobiae_C Kruskal-Wallis 2.572550e-09
## 42                         Coriobacteriia_C Kruskal-Wallis 4.373663e-22
## 43                         Actinobacteria_C Kruskal-Wallis 1.748252e-19
## 44                                Bacilli_C Kruskal-Wallis 6.906521e-12
## 45                    Gammaproteobacteria_C Kruskal-Wallis 2.793530e-05
## 46                          Pirellulaceae_F Kruskal-Wallis 3.196572e-09
## 47                        Anaerolineaceae_F Kruskal-Wallis 3.746162e-09
## 48                          Neisseriaceae_F Kruskal-Wallis 1.969734e-05
## 49                        Microscillaceae_F Kruskal-Wallis 3.202746e-09
## 50                        Pedosphaeraceae_F Kruskal-Wallis 1.302641e-15
## 51                                    A4b_F Kruskal-Wallis 5.664201e-18
## 52                     Acholeplasmataceae_F Kruskal-Wallis 1.552118e-08
## 53                                UCG-011_F Kruskal-Wallis 1.038745e-06
## 54                       Selenomonadaceae_F Kruskal-Wallis 3.125086e-06
## 55     [Clostridium] methylpentosum group_F Kruskal-Wallis 2.364971e-18
## 56                      Marinilabiliaceae_F Kruskal-Wallis 2.857166e-19
## 57                      Flavobacteriaceae_F Kruskal-Wallis 1.274939e-05
## 58                      Staphylococcaceae_F Kruskal-Wallis 4.246920e-22
## 59                     Porphyromonadaceae_F Kruskal-Wallis 5.124256e-12
## 60                        Eggerthellaceae_F Kruskal-Wallis 2.117931e-18
## 61                        Barnesiellaceae_F Kruskal-Wallis 2.522551e-07
## 62                          Monoglobaceae_F Kruskal-Wallis 2.153674e-06
## 63                     Bifidobacteriaceae_F Kruskal-Wallis 1.699744e-15
## 64                            Bacillaceae_F Kruskal-Wallis 1.838790e-09
## 65                 Hungateiclostridiaceae_F Kruskal-Wallis 1.127118e-12
## 66                         Marinifilaceae_F Kruskal-Wallis 7.717807e-11
## 67                       Streptococcaceae_F Kruskal-Wallis 6.582098e-14
## 68                  Peptostreptococcaceae_F Kruskal-Wallis 1.042119e-16
## 69              Erysipelatoclostridiaceae_F Kruskal-Wallis 3.073290e-13
## 70                         Clostridiaceae_F Kruskal-Wallis 9.938201e-16
## 71                    Erysipelotrichaceae_F Kruskal-Wallis 1.671406e-05
## 72                         Tannerellaceae_F Kruskal-Wallis 4.258468e-05
## 73  [Eubacterium] coprostanoligenes group_F Kruskal-Wallis 2.001438e-06
## 74                      Butyricicoccaceae_F Kruskal-Wallis 2.897180e-05
## 75                         Muribaculaceae_F Kruskal-Wallis 3.758091e-05
## 76                        Ruminococcaceae_F Kruskal-Wallis 1.458702e-08
## 77                           Halobacillus_G Kruskal-Wallis 4.880366e-05
## 78                              Megamonas_G Kruskal-Wallis 2.480778e-05
## 79                              Hespellia_G Kruskal-Wallis 1.148969e-05
## 80            Lachnospiraceae NK4B4 group_G Kruskal-Wallis 3.406312e-05
## 81                                XBB1006_G Kruskal-Wallis 4.544741e-06
## 82                     Macellibacteroides_G Kruskal-Wallis 8.252963e-10
## 83                           Paludibacter_G Kruskal-Wallis 7.121028e-06
## 84                            Anaerovorax_G Kruskal-Wallis 3.805548e-08
## 85                 Candidatus Phytoplasma_G Kruskal-Wallis 7.466909e-06
## 86                            Selenomonas_G Kruskal-Wallis 1.912782e-05
## 87                Lachnospiraceae UCG-009_G Kruskal-Wallis 9.999884e-06
## 88            [Eubacterium] nodatum group_G Kruskal-Wallis 2.465934e-07
## 89                          Mogibacterium_G Kruskal-Wallis 4.320301e-05
## 90                          Papillibacter_G Kruskal-Wallis 4.897195e-06
## 91                           Anaerococcus_G Kruskal-Wallis 3.272729e-12
## 92                                 CAG-56_G Kruskal-Wallis 9.881041e-07
## 93                         Proteiniphilum_G Kruskal-Wallis 4.464993e-13
## 94                      Ruminiclostridium_G Kruskal-Wallis 1.142188e-05
## 95                        Syntrophococcus_G Kruskal-Wallis 2.488686e-12
## 96                     Pseudobutyrivibrio_G Kruskal-Wallis 6.026604e-06
## 97                          Butyricimonas_G Kruskal-Wallis 1.196336e-05
## 98                      Caproiciproducens_G Kruskal-Wallis 1.550165e-05
## 99                 Prevotellaceae UCG-004_G Kruskal-Wallis 4.891415e-14
## 100                               UBA1819_G Kruskal-Wallis 1.981127e-19
## 101                           Barnesiella_G Kruskal-Wallis 3.908806e-06
## 102           Erysipelotrichaceae UCG-003_G Kruskal-Wallis 6.623150e-10
## 103           Rikenellaceae RC9 gut group_G Kruskal-Wallis 3.864326e-16
## 104                         Porphyromonas_G Kruskal-Wallis 1.759612e-14
## 105                        Staphylococcus_G Kruskal-Wallis 1.471866e-23
## 106                                    A2_G Kruskal-Wallis 3.435145e-21
## 107          Lachnospiraceae NC2004 group_G Kruskal-Wallis 2.010754e-07
## 108                       Bifidobacterium_G Kruskal-Wallis 1.343449e-14
## 109                            Monoglobus_G Kruskal-Wallis 2.153674e-06
## 110         Lachnospiraceae XPB1014 group_G Kruskal-Wallis 8.116506e-06
## 111                            Romboutsia_G Kruskal-Wallis 3.566692e-13
## 112                           Odoribacter_G Kruskal-Wallis 9.910652e-11
## 113                        Acetitomaculum_G Kruskal-Wallis 2.040740e-06
## 114                            Tyzzerella_G Kruskal-Wallis 3.609278e-05
## 115           Clostridium sensu stricto 1_G Kruskal-Wallis 2.878502e-17
## 116                        Marvinbryantia_G Kruskal-Wallis 1.915920e-11
## 117                         NK4A214 group_G Kruskal-Wallis 1.881102e-08
## 118          Lachnospiraceae NK3A20 group_G Kruskal-Wallis 1.638832e-05
## 119                         Streptococcus_G Kruskal-Wallis 6.665449e-13
## 120                      Fusicatenibacter_G Kruskal-Wallis 1.087316e-07
## 121                                 Dorea_G Kruskal-Wallis 1.466599e-05
## 122                          Anaerostipes_G Kruskal-Wallis 2.997463e-10
## 123                       Subdoligranulum_G Kruskal-Wallis 5.234297e-15
## 124                       Parabacteroides_G Kruskal-Wallis 3.893385e-05
## 125          [Ruminococcus] torques group_G Kruskal-Wallis 9.216209e-09
## 126                      Faecalibacterium_G Kruskal-Wallis 7.571240e-10
## 127                     Lachnoclostridium_G Kruskal-Wallis 1.729484e-07
## 128         Lachnospiraceae NK4A136 group_G Kruskal-Wallis 4.135324e-10
## 129                          Agathobacter_G Kruskal-Wallis 4.325760e-06
##     Normal_Distribution Adjusted_P_Value_Bonferroni Adjusted_P_Value_Holm
## 1                    No                3.084681e-04          2.796972e-04
## 2                    No                6.576800e-04          5.955614e-04
## 3                    No                4.705606e-07          4.438937e-07
## 4                    No                2.636142e-06          2.458740e-06
## 5                    No                1.239347e-12          1.204229e-12
## 6                    No                5.541434e-03          4.900276e-03
## 7                    No                5.349862e-20          5.343545e-20
## 8                    No                7.136139e-06          6.596927e-06
## 9                    No                5.590534e-20          5.577333e-20
## 10                   No                2.331423e-03          2.083692e-03
## 11                   No                4.684268e-02          3.981905e-02
## 12                   No                2.707496e-06          2.522095e-06
## 13                   No                3.172999e-06          2.944483e-06
## 14                   No                1.587370e-03          1.431819e-03
## 15                   No                1.103337e-12          1.074679e-12
## 16                   No                1.314644e-05          1.210652e-05
## 17                   No                2.861738e-16          2.821194e-16
## 18                   No                4.848815e-07          4.568305e-07
## 19                   No                7.026907e-05          6.437875e-05
## 20                   No                2.075481e-18          2.060779e-18
## 21                   No                1.824162e-03          1.638946e-03
## 22                   No                1.439683e-12          1.397190e-12
## 23                   No                4.413080e-14          4.324506e-14
## 24                   No                6.302558e-07          5.923065e-07
## 25                   No                8.256495e-05          7.554644e-05
## 26                   No                3.964750e-19          3.941345e-19
## 27                   No                1.976691e-11          1.906678e-11
## 28                   No                6.362102e-07          5.971512e-07
## 29                   No                8.321561e-08          7.889272e-08
## 30                   No                7.400242e-14          7.242976e-14
## 31                   No                3.637598e-02          3.109352e-02
## 32                   No                4.138427e-03          3.674259e-03
## 33                   No                4.684268e-02          3.981905e-02
## 34                   No                1.881618e-02          1.628366e-02
## 35                   No                1.693644e-02          1.467692e-02
## 36                   No                2.546547e-04          2.312036e-04
## 37                   No                3.725845e-02          3.175986e-02
## 38                   No                4.349188e-06          4.025695e-06
## 39                   No                2.202096e-03          1.970707e-03
## 40                   No                9.541384e-17          9.451265e-17
## 41                   No                2.178950e-06          2.034887e-06
## 42                   No                3.704493e-19          3.686998e-19
## 43                   No                1.480770e-16          1.465035e-16
## 44                   No                5.849823e-09          5.566656e-09
## 45                   No                2.366120e-02          2.042070e-02
## 46                   No                2.707496e-06          2.522095e-06
## 47                   No                3.172999e-06          2.944483e-06
## 48                   No                1.668365e-02          1.447755e-02
## 49                   No                2.712726e-06          2.522095e-06
## 50                   No                1.103337e-12          1.074679e-12
## 51                   No                4.797578e-15          4.712615e-15
## 52                   No                1.314644e-05          1.210652e-05
## 53                   No                8.798173e-04          7.946401e-04
## 54                   No                2.646948e-03          2.362565e-03
## 55                   No                2.003131e-15          1.970021e-15
## 56                   No                2.420020e-16          2.388591e-16
## 57                   No                1.079873e-02          9.447299e-03
## 58                   No                3.597141e-19          3.584400e-19
## 59                   No                4.340245e-09          4.135275e-09
## 60                   No                1.793888e-15          1.766355e-15
## 61                   No                2.136601e-04          1.942364e-04
## 62                   No                1.824162e-03          1.638946e-03
## 63                   No                1.439683e-12          1.397190e-12
## 64                   No                1.557456e-06          1.456322e-06
## 65                   No                9.546691e-10          9.129658e-10
## 66                   No                6.536983e-08          6.205117e-08
## 67                   No                5.575037e-11          5.364410e-11
## 68                   No                8.826749e-14          8.628746e-14
## 69                   No                2.603076e-10          2.501658e-10
## 70                   No                8.417656e-13          8.208954e-13
## 71                   No                1.415681e-02          1.231826e-02
## 72                   No                3.606922e-02          3.087389e-02
## 73                   No                1.695218e-03          1.527098e-03
## 74                   No                2.453911e-02          2.114941e-02
## 75                   No                3.183103e-02          2.732132e-02
## 76                   No                1.235521e-05          1.139246e-05
## 77                   No                4.133670e-02          3.518744e-02
## 78                   No                2.101219e-02          1.815930e-02
## 79                   No                9.731768e-03          8.536841e-03
## 80                   No                2.885146e-02          2.483201e-02
## 81                   No                3.849396e-03          3.422190e-03
## 82                   No                6.990259e-07          6.544600e-07
## 83                   No                6.031510e-03          5.326529e-03
## 84                   No                3.223299e-05          2.956911e-05
## 85                   No                6.324472e-03          5.577781e-03
## 86                   No                1.620126e-02          1.407808e-02
## 87                   No                8.469902e-03          7.449913e-03
## 88                   No                2.088646e-04          1.901235e-04
## 89                   No                3.659295e-02          3.123578e-02
## 90                   No                4.147924e-03          3.677793e-03
## 91                   No                2.772002e-09          2.644365e-09
## 92                   No                8.369242e-04          7.568877e-04
## 93                   No                3.781849e-10          3.625574e-10
## 94                   No                9.674336e-03          8.497882e-03
## 95                   No                2.107917e-09          2.013347e-09
## 96                   No                5.104534e-03          4.519953e-03
## 97                   No                1.013297e-02          8.876815e-03
## 98                   No                1.312990e-02          1.145572e-02
## 99                   No                4.143029e-11          3.991395e-11
## 100                  No                1.678015e-16          1.658204e-16
## 101                  No                3.310759e-03          2.951148e-03
## 102                  No                5.609808e-07          5.278651e-07
## 103                  No                3.273084e-13          3.195798e-13
## 104                  No                1.490392e-11          1.439363e-11
## 105                  No                1.246671e-20          1.246671e-20
## 106                  No                2.909568e-18          2.885522e-18
## 107                  No                1.703109e-04          1.552302e-04
## 108                  No                1.137902e-11          1.100285e-11
## 109                  No                1.824162e-03          1.638946e-03
## 110                  No                6.874681e-03          6.054913e-03
## 111                  No                3.020988e-10          2.899721e-10
## 112                  No                8.394322e-08          7.948343e-08
## 113                  No                1.728507e-03          1.555044e-03
## 114                  No                3.057059e-02          2.627555e-02
## 115                  No                2.438091e-14          2.392035e-14
## 116                  No                1.622785e-08          1.542316e-08
## 117                  No                1.593293e-05          1.463497e-05
## 118                  No                1.388090e-02          1.209458e-02
## 119                  No                5.645635e-10          5.405679e-10
## 120                  No                9.209569e-05          8.415828e-05
## 121                  No                1.242210e-02          1.085283e-02
## 122                  No                2.538851e-07          2.400967e-07
## 123                  No                4.433449e-12          4.292123e-12
## 124                  No                3.297697e-02          2.826597e-02
## 125                  No                7.806129e-06          7.207076e-06
## 126                  No                6.412840e-07          6.011565e-07
## 127                  No                1.464873e-04          1.336891e-04
## 128                  No                3.502620e-07          3.308260e-07
## 129                  No                3.663919e-03          3.261623e-03
##     Adjusted_P_Value_BH test_pass
## 1          3.855851e-06         Y
## 2          8.119506e-06         Y
## 3          9.603279e-09         Y
## 4          4.447092e-08         Y
## 5          4.957386e-14         Y
## 6          5.597408e-05         Y
## 7          1.863511e-20         Y
## 8          1.097868e-07         Y
## 9          1.863511e-20         Y
## 10         2.562004e-05         Y
## 11         3.631216e-04         Y
## 12         4.447092e-08         Y
## 13         5.036506e-08         Y
## 14         1.889727e-05         Y
## 15         4.597237e-14         Y
## 16         1.905281e-07         Y
## 17         2.201337e-17         Y
## 18         9.697631e-09         Y
## 19         9.759594e-07         Y
## 20         2.964973e-19         Y
## 21         2.049620e-05         Y
## 22         5.332161e-14         Y
## 23         2.451711e-15         Y
## 24         1.187563e-08         Y
## 25         1.131027e-06         Y
## 26         6.607916e-20         Y
## 27         6.376422e-13         Y
## 28         1.187563e-08         Y
## 29         1.824853e-09         Y
## 30         3.894864e-15         Y
## 31         2.927436e-04         Y
## 32         4.276210e-05         Y
## 33         3.631216e-04         Y
## 34         1.636190e-04         Y
## 35         1.485653e-04         Y
## 36         3.223477e-06         Y
## 37         2.957020e-04         Y
## 38         6.795606e-08         Y
## 39         2.446774e-05         Y
## 40         1.060154e-17         Y
## 41         3.822719e-08         Y
## 42         6.607916e-20         Y
## 43         1.480770e-17         Y
## 44         1.392815e-10         Y
## 45         2.022325e-04         Y
## 46         4.447092e-08         Y
## 47         5.036506e-08         Y
## 48         1.476429e-04         Y
## 49         4.447092e-08         Y
## 50         4.597237e-14         Y
## 51         2.998486e-16         Y
## 52         1.905281e-07         Y
## 53         1.060021e-05         Y
## 54         2.877117e-05         Y
## 55         1.335421e-16         Y
## 56         2.016683e-17         Y
## 57         1.009228e-04         Y
## 58         6.607916e-20         Y
## 59         1.058596e-10         Y
## 60         1.281348e-16         Y
## 61         2.739232e-06         Y
## 62         2.049620e-05         Y
## 63         5.332161e-14         Y
## 64         2.781171e-08         Y
## 65         2.512287e-11         Y
## 66         1.485678e-09         Y
## 67         1.689405e-12         Y
## 68         4.413374e-15         Y
## 69         7.656107e-12         Y
## 70         3.826207e-14         Y
## 71         1.275388e-04         Y
## 72         2.927436e-04         Y
## 73         1.994375e-05         Y
## 74         2.079586e-04         Y
## 75         2.630664e-04         Y
## 76         1.844061e-07         Y
## 77         3.254858e-04         Y
## 78         1.811396e-04         Y
## 79         9.268351e-05         Y
## 80         2.424492e-04         Y
## 81         4.051996e-05         Y
## 82         1.270956e-08         Y
## 83         6.031510e-05         Y
## 84         4.539858e-07         Y
## 85         6.261853e-05         Y
## 86         1.446541e-04         Y
## 87         8.223205e-05         Y
## 88         2.712527e-06         Y
## 89         2.927436e-04         Y
## 90         4.276210e-05         Y
## 91         6.930005e-11         Y
## 92         1.020639e-05         Y
## 93         1.050514e-11         Y
## 94         9.268351e-05         Y
## 95         5.404915e-11         Y
## 96         5.208708e-05         Y
## 97         9.559404e-05         Y
## 98         1.204578e-04         Y
## 99         1.294696e-12         Y
## 100        1.525468e-17         Y
## 101        3.559956e-05         Y
## 102        1.099962e-08         Y
## 103        1.558612e-14         Y
## 104        4.967972e-13         Y
## 105        1.246671e-20         Y
## 106        3.636960e-19         Y
## 107        2.240932e-06         Y
## 108        3.923799e-13         Y
## 109        2.049620e-05         Y
## 110        6.739883e-05         Y
## 111        8.631396e-12         Y
## 112        1.824853e-09         Y
## 113        2.009892e-05         Y
## 114        2.547549e-04         Y
## 115        1.434171e-15         Y
## 116        3.773918e-10         Y
## 117        2.276133e-07         Y
## 118        1.261900e-04         Y
## 119        1.525847e-11         Y
## 120        1.244536e-06         Y
## 121        1.150194e-04         Y
## 122        5.401810e-09         Y
## 123        1.583375e-13         Y
## 124        2.703030e-04         Y
## 125        1.182747e-07         Y
## 126        1.187563e-08         Y
## 127        1.953164e-06         Y
## 128        7.297125e-09         Y
## 129        3.897786e-05         Y
combined_bacteria_G <- combined_bacteria_clean %>%
  select(Health_state, ends_with("_G"))

d <- dist(combined_bacteria_G) # euclidean distances between the rows
## Warning in dist(combined_bacteria_G): в результате преобразования созданы NA
fit <- cmdscale(d,eig=TRUE, k=2) # k is the number of dim

df_mds <- data.frame(
  x = fit$points[,1],
  y = fit$points[,2]
  )  

df_full <- cbind(df_mds, combined_info) %>% mutate(Health_state_n = case_when(Health_state == "Health"  ~ 0,
                                                                              Health_state == "Disease" ~ 1))

ggplot(df_full, aes(x = x, y = y, color = Health_state)) +
  geom_point() +
  theme_bw() +
  ggtitle("Распределение вектора таксонов в зависимости от группы пациентов")

adonis2(d ~ Health_state_n, data = df_full)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = d ~ Health_state_n, data = df_full)
##                 Df SumOfSqs      R2      F Pr(>F)    
## Health_state_n   1     2594 0.02239 8.6805  0.001 ***
## Residual       379   113256 0.97761                  
## Total          380   115850 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Тест Манна-Уитни для сравнения каждого таксона между больными и здоровыми

Wilcox_comparison_round_0 <- data_wide %>% 
  select(Health_state, Archaea, Bacteria,
         ends_with(c("_D", "_P", "_O", "_C", "_F", "_G"))) %>% 
  mutate(across (where (is.numeric), function (x) round (x,0))) %>% 
  summarise_if (is.numeric, function (x) (wilcox.test(x ~ .$Health_state)$p.value)) %>% 
  pivot_longer(everything()) %>% 
  rename (Taxon = name, p_value = value) %>% 
  filter (p_value <= 0.05 ) %>% 
  arrange(p_value) %>% 
  add_column(p_value_holm = p.adjust(.$p_value, "holm")) %>% 
  add_column(p_value_BH = p.adjust(.$p_value, "BH"))

 rbind (
   "Количество значимо различающихся таксонов по p_value" = nrow (Wilcox_comparison_round_0),
 "Количество значимо различающихся таксонов по p_value_holm" = nrow (Wilcox_comparison_round_0 %>% filter (p_value_holm <= 0.05 )),
 "Количество значимо различающихся таксонов по p_value_BH" = nrow (Wilcox_comparison_round_0 %>% filter (p_value_BH <= 0.05 ))
 )
##                                                           [,1]
## Количество значимо различающихся таксонов по p_value       141
## Количество значимо различающихся таксонов по p_value_holm   66
## Количество значимо различающихся таксонов по p_value_BH    141
Wilcox_comparison_round_1 <- data_wide %>% 
  select(Health_state, Archaea, Bacteria,
         ends_with(c("_D", "_P", "_O", "_C", "_F", "_G"))) %>% 
  mutate(across (where (is.numeric), function (x) round (x,1))) %>% 
  summarise_if (is.numeric, function (x) (wilcox.test(x ~ .$Health_state)$p.value)) %>% 
  pivot_longer(everything()) %>% 
  rename (Taxon = name, p_value = value) %>% 
  filter (p_value <= 0.05 ) %>% 
  arrange(p_value) %>% 
  add_column(p_value_holm = p.adjust(.$p_value, "holm")) %>% 
  add_column(p_value_BH = p.adjust(.$p_value, "BH"))

 rbind (
   "Количество значимо различающихся таксонов по p_value" = nrow (Wilcox_comparison_round_1),
 "Количество значимо различающихся таксонов по p_value_holm" = nrow (Wilcox_comparison_round_1 %>% filter (p_value_holm <= 0.05 )),
 "Количество значимо различающихся таксонов по p_value_BH" = nrow (Wilcox_comparison_round_1 %>% filter (p_value_BH <= 0.05 ))
 )
##                                                           [,1]
## Количество значимо различающихся таксонов по p_value       319
## Количество значимо различающихся таксонов по p_value_holm  181
## Количество значимо различающихся таксонов по p_value_BH    319
Wilcox_comparison_round_2 <- data_wide %>% 
  select(Health_state, Archaea, Bacteria,
         ends_with(c("_D", "_P", "_O", "_C", "_F", "_G"))) %>% 
  mutate(across (where (is.numeric), function (x) round (x,2))) %>% 
  summarise_if (is.numeric, function (x) (wilcox.test(x ~ .$Health_state)$p.value)) %>% 
  pivot_longer(everything()) %>% 
  rename (Taxon = name, p_value = value) %>% 
  filter (p_value <= 0.05 ) %>% 
  arrange(p_value) %>% 
  add_column(p_value_holm = p.adjust(.$p_value, "holm")) %>% 
  add_column(p_value_BH = p.adjust(.$p_value, "BH"))

 rbind (
   "Количество значимо различающихся таксонов по p_value" = nrow (Wilcox_comparison_round_2),
 "Количество значимо различающихся таксонов по p_value_holm" = nrow (Wilcox_comparison_round_2 %>% filter (p_value_holm <= 0.05 )),
 "Количество значимо различающихся таксонов по p_value_BH" = nrow (Wilcox_comparison_round_2 %>% filter (p_value_BH <= 0.05 ))
 )
##                                                           [,1]
## Количество значимо различающихся таксонов по p_value       646
## Количество значимо различающихся таксонов по p_value_holm  430
## Количество значимо различающихся таксонов по p_value_BH    646

Распределения таксонов G(Genus - Род)

Гистограмма по количеству пациентов (Здоров - СРК)

# Получаем уникальные таксоны, удовлетворяющие условиям
unique_taxa <- Wilcox_comparison_round_2 %>%
  subset(grepl("_G", Taxon)) %>%
  filter(p_value_holm < 0.05) %>%
  distinct(Taxon)

# Определяем размер шага (20 таксонов на каждой итерации)
step_size <- 18

# Разбиваем уникальные таксоны на группы по step_size
taxon_groups <- split(unique_taxa, rep(1:ceiling(nrow(unique_taxa) / step_size), each = step_size, length.out = nrow(unique_taxa)))

# Проходим по группам и строим графики
for (i in seq_along(taxon_groups)) {
  current_taxa <- taxon_groups[[i]]
  
  G_long_filtered <- G_long %>%
    filter(Taxon %in% current_taxa$Taxon & Percentage > 0) %>%
    group_by(Health_state, Taxon) %>%
    summarise(Count = n(), .groups = "drop")
  
  bar_plot <- ggplot(G_long_filtered, aes(x = Health_state, y = Count, fill = Health_state == "Disease")) +
    geom_bar(position = "dodge", color = "black", stat = "identity") +
    scale_fill_manual(values = c("skyblue", "red"), guide = FALSE) +
    labs(title = paste("Гистограмма для таксонов", (i - 1) * step_size + 1, "до", min(i * step_size, nrow(unique_taxa)), "в разрезе Health_state"),
         x = "Статус пациента",
         y = "Количество пациентов, у которых обнаружен данный таксон") +
    coord_flip() +
    facet_wrap(~Taxon, scales = "free_y", ncol = 2, shrink = 0.7) +  # Уменьшение пространства между фасетами
    theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5),  # Поворот текста на оси X и уменьшение шрифта
          axis.text.y = element_text(hjust = 1, size = 5),
          strip.text = element_text(size = 5))  # Уменьшение шрифта для названия фасет
  
  print(bar_plot)  # Отображение графика на экране
  
}
## Warning: The `guide` argument in `scale_*()` cannot be `FALSE`. This was deprecated in
## ggplot2 3.3.4.
## ℹ Please use "none" instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

график солнышко не готов пока 😭

# Создаем набор данных с ограничением на первые 10 таксонов с разными Health_state

unique_taxa <- Wilcox_comparison_round_2 %>%
  subset(grepl("_G", Taxon)) %>%
  filter(p_value_holm < 0.05) %>%
  distinct(Taxon)

data <- G_long %>%
  filter(Taxon %in% unique_taxa$Taxon & Percentage > 0) %>%
  group_by(Taxon, Health_state) %>%
  summarise(Count = n(), .groups = "drop") %>%
  arrange(Taxon, Health_state) %>%
  slice_head(n = 20)

# Add a unique id for each Taxon
data <- data %>%
  mutate(id = group_indices(., Taxon))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `id = group_indices(., Taxon)`.
## Caused by warning:
## ! The `...` argument of `group_indices()` is deprecated as of dplyr 1.0.0.
## ℹ Please `group_by()` first
# Set a number of 'empty bar'
empty_bar <- 10

# Create empty rows with unique id for each Taxon
to_add <- data.frame(
  Taxon = rep(unique(data$Taxon), each = empty_bar),
  Health_state = rep(levels(data$Health_state), times = length(unique(data$Taxon))),
  Count = rep(NA, empty_bar),
  id = rep(seq_len(length(unique(data$Taxon))), each = empty_bar)
)

# Combine the initial dataset with the empty rows
data <- rbind(data, to_add) %>%
  arrange(Taxon, Health_state)

# Reset the id values
data$id <- seq_len(nrow(data))


# Исправление: создание переменной value на основе Count
data$value <- data$Count
 
# Get the name and the y position of each label
label_data <- data
number_of_bar <- nrow(label_data)/2
angle <- 90 - 360 * (label_data$id-0.5) / (number_of_bar * 2)
label_data$hjust <- ifelse(angle < -90, 1, 0)
label_data$angle <- ifelse(angle < -90, angle+180, angle)
 
# Make the plot
p <- ggplot(data, aes(x=as.factor(id), y=value, fill=Health_state)) +       
  geom_bar(stat="identity", position="stack", alpha=0.7) +
  scale_fill_manual(values = c("skyblue", "red")) +  # Используем только два цвета
  ylim(c(0, max(data$value) + 10)) +
  theme_minimal() +
  theme(
    axis.text = element_blank(),
    axis.title = element_blank(),
    panel.grid = element_blank(),
    plot.margin = unit(rep(-1,4), "cm") 
  ) +
  coord_polar(start = 0) + 
  geom_text(data=label_data, aes(x=id, y=value+5, label=Taxon, hjust=hjust), color="black", fontface="bold", alpha=0.6, size=2.5, angle=label_data$angle, inherit.aes = FALSE ) +
  geom_text(aes(x=as.factor(id), y=value, label=Count), position=position_stack(vjust=0.5), size=2.5, color="black") +  # Добавление количества наблюдений
  labs(title = "График 'солнышко' для первых 10 пар таксонов с разными Health_state")

print(p)
## Warning: Removed 110 rows containing missing values (`position_stack()`).
## Warning: Removed 110 rows containing missing values (`position_stack()`).
## Warning: Removed 110 rows containing missing values (`geom_text()`).

Боксплот по возрасту

# Определяем размер шага (4 таксона на каждой итерации)
step_size <- 4

# Разбиваем уникальные таксоны на группы по step_size
taxon_groups <- split(unique_taxa, rep(1:ceiling(nrow(unique_taxa) / step_size), each = step_size, length.out = nrow(unique_taxa)))

# Проходим по группам и строим боксплоты для переменной Age
for (i in seq_along(taxon_groups)) {
  current_taxa <- taxon_groups[[i]]
  
  G_long_filtered <- G_long %>%
    filter(Taxon %in% current_taxa$Taxon & Percentage > 0)
  
  # Строим боксплот
  box_plot <- ggplot(G_long_filtered, aes(x = Health_state, y = Age, fill = Health_state)) +
    geom_boxplot() +
    labs(title = paste("Боксплоты для таксонов по статусу"),
         x = "Health_state",
         y = "Age") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8),  # Уменьшаем размер шрифта для подписей Taxon
          axis.title.x = element_blank(),  # Убираем название оси X
          legend.position = "bottom") +  # Перемещаем легенду вниз
    scale_fill_manual(values = c("red", "skyblue")) +
    facet_grid(~Taxon, scales = "free_y", space = "free")  # Используем facet_grid вместо facet_wrap
  
  # Отображаем график на экране
  print(box_plot)
  
}

Боксплот по возросту и полу

# Определяем размер шага (3 таксона на каждой итерации)
step_size <- 3

# Разбиваем уникальные таксоны на группы по step_size
taxon_groups <- split(unique_taxa, rep(1:ceiling(nrow(unique_taxa) / step_size), each = step_size, length.out = nrow(unique_taxa)))

# Проходим по группам и строим боксплоты для переменной Age
for (i in seq_along(taxon_groups)) {
  current_taxa <- taxon_groups[[i]]
  
  G_long_filtered <- G_long %>%
    filter(Taxon %in% current_taxa$Taxon & Percentage > 0)
  
  # Строим боксплот
  box_plot <- ggplot(G_long_filtered, aes(x = interaction(Health_state, Sex), y = Age, fill = Health_state)) +
    geom_boxplot() +
    labs(title = paste("Боксплоты для таксона (Пол + статус заболевания)"),
         x = "Health_state + Sex",
         y = "Age") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8),  # Уменьшаем размер шрифта для подписей Taxon
          axis.title.x = element_blank(),  # Убираем название оси X
          legend.position = "bottom") +  # Перемещаем легенду вниз
    scale_fill_manual(values = c("red", "skyblue")) +
    facet_grid(~Taxon, scales = "free_y", space = "free")  # Используем facet_grid вместо facet_wrap
  
  # Отображаем график на экране
  print(box_plot)
  
}

Боксплот по возросту и статусу курения

# Определяем размер шага (3 таксона на каждой итерации)
step_size <- 3

# Разбиваем уникальные таксоны на группы по step_size
taxon_groups <- split(unique_taxa, rep(1:ceiling(nrow(unique_taxa) / step_size), each = step_size, length.out = nrow(unique_taxa)))

# Проходим по группам и строим боксплоты для переменной Age
for (i in seq_along(taxon_groups)) {
  current_taxa <- taxon_groups[[i]]
  
  G_long_filtered <- G_long %>%
    filter(Taxon %in% current_taxa$Taxon & Percentage > 0)
  
  # Строим боксплот
  box_plot <- ggplot(G_long_filtered, aes(x = interaction(Health_state, Smoking), y = Age, fill = Health_state)) +
    geom_boxplot() +
    labs(title = paste("Боксплоты для таксона (Статус курения + статус заболевания)"),
         x = "Health_state + Smoking",
         y = "Age") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8),  # Уменьшаем размер шрифта для подписей Taxon
          axis.title.x = element_blank(),  # Убираем название оси X
          legend.position = "bottom") +  # Перемещаем легенду вниз
    scale_fill_manual(values = c("red", "skyblue")) +
    facet_grid(~Taxon, scales = "free_y", space = "free")  # Используем facet_grid вместо facet_wrap
  
  # Отображаем график на экране
  print(box_plot)
  
}

Боксплот по возросту и статусу употребления алкоголя

# Определяем размер шага (3 таксона на каждой итерации)
step_size <- 3

# Разбиваем уникальные таксоны на группы по step_size
taxon_groups <- split(unique_taxa, rep(1:ceiling(nrow(unique_taxa) / step_size), each = step_size, length.out = nrow(unique_taxa)))

# Проходим по группам и строим боксплоты для переменной Age
for (i in seq_along(taxon_groups)) {
  current_taxa <- taxon_groups[[i]]
  
  G_long_filtered <- G_long %>%
    filter(Taxon %in% current_taxa$Taxon & Percentage > 0)
  
  # Строим боксплот
  box_plot <- ggplot(G_long_filtered, aes(x = interaction(Health_state, Alcohol), y = Age, fill = Health_state)) +
    geom_boxplot() +
    labs(title = paste("Боксплоты для таксона (Статус употребления алкоголя + статус заболевания)"),
         x = "Health_state + Alcohol",
         y = "Age") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8),  # Уменьшаем размер шрифта для подписей Taxon
          axis.title.x = element_blank(),  # Убираем название оси X
          legend.position = "bottom") +  # Перемещаем легенду вниз
    scale_fill_manual(values = c("red", "skyblue")) +
    facet_grid(~Taxon, scales = "free_y", space = "free")  # Используем facet_grid вместо facet_wrap
  
  # Отображаем график на экране
  print(box_plot)
  
}